from bs4 import BeautifulSoup as bs
import re, io, requests
from time import sleep
from random import random

def download_pdf(save_path, pdf_url, pdf_name=None):
    """Download a single PDF and write it into *save_path*.

    Parameters
    ----------
    save_path : str
        Directory to save into (assumed to already exist).
    pdf_url : str
        Direct URL of the PDF file.
    pdf_name : str, optional
        Filename to save as.  When None, the last path segment of the
        URL is used (e.g. ``..._CVPR_2023_paper.pdf``).

    Raises
    ------
    requests.HTTPError
        If the server responds with a non-2xx status code.
    requests.Timeout
        If the request exceeds the 30 s timeout.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8"}

    # timeout: don't hang forever; raise_for_status: fail loudly on 404/500
    # instead of silently saving an HTML error page as a ".pdf".
    response = requests.get(url=pdf_url, headers=send_headers, timeout=30)
    response.raise_for_status()

    # Bug fix: the original unconditionally overwrote the pdf_name argument
    # with a hard-coded-prefix strip.  Now an explicit caller-supplied name
    # wins; otherwise fall back to the URL's last path segment, which works
    # for any host (and matches the old behavior for CVPR 2023 URLs).
    if pdf_name is None:
        pdf_name = pdf_url.rstrip('/').rsplit('/', 1)[-1]

    with open(f'{save_path}/{pdf_name}', mode='wb') as f:
        f.write(response.content)  # no intermediate BytesIO round-trip needed
    print(f'Finish :: {pdf_url}')

# Parse the locally saved CVPR 2023 open-access index page.  The page lists,
# for each paper, a title anchor (href ends in .html) immediately followed
# by a "pdf" anchor, so the filtered anchor list is consumed in pairs.
# (with-statement fixes the original's leaked file handle.)
with open('./CVPR 2023 Open Access Repository.html', 'r', encoding='utf-8') as html_file:
    html_page = html_file.read()
soup = bs(html_page, 'lxml')

# Keep only anchors that belong to papers (title links or pdf links).
title_pdfs = soup.find_all('a')
title_pdfs = [p for p in title_pdfs if 'paper' in str(p) or 'html' in str(p)]

# Extract (paper title, pdf url) pairs.  Raw string fixes the invalid
# escape-sequence warning the original non-raw '\s\S' pattern produced.
paper_pair = []
pattern = re.compile(r'href="([\s\S]+?)">')
for i in range(0, len(title_pdfs), 2):
    paper_title = title_pdfs[i].contents[0]              # text of the title anchor
    pdf_url = re.findall(pattern, str(title_pdfs[i + 1]))[0]  # href of the pdf anchor
    paper_pair.append((paper_title, pdf_url))

# Download every paper; failures are logged to a CSV ("title;url" per line)
# so they can be retried later.  The with-statement guarantees the log is
# closed (and flushed) even if the loop is interrupted.
with open('fail_download.csv', 'w', encoding='utf-8') as fail_log:
    for i, pair in enumerate(paper_pair):
        try:
            download_pdf('papers', pair[1])
        # Narrowed from a bare `except:` so Ctrl-C (KeyboardInterrupt)
        # still stops the script instead of being logged as a failure.
        except Exception as exc:
            print('Fail to download. url= ', pair[1], exc)
            # Bug fix: the original omitted the newline, so all failed
            # entries ran together on a single CSV line.
            fail_log.write(f'{pair[0]};{pair[1]}\n')
        # Random polite delay of up to 15 s between requests (the old
        # message claimed 5 s, contradicting random()*15).
        print(f'{i}/{len(paper_pair)} sleeping now, and downloading will start within 15 s')
        sleep(random() * 15)